In [ ]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = 100

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sb

import json

import networkx as nx

import statsmodels.formula.api as smf

In [ ]:
state_codings = {1: "Alabama", 2: "Alaska", 4: "Arizona", 5: "Arkansas", 6: "California", 
 8: "Colorado", 9: "Connecticut", 10: "Delaware", 11: "District Of Columbia",
 12: "Florida", 13: "Georgia", 15: "Hawaii", 16: "Idaho", 17: "Illinois", 
 18: "Indiana", 19: "Iowa", 20: "Kansas", 21: "Kentucky", 22: "Louisiana", 
 23: "Maine", 24: "Maryland", 25: "Massachusetts", 26: "Michigan", 
 27: "Minnesota", 28: "Mississippi", 29: "Missouri", 30: "Montana", 
 31: "Nebraska", 32: "Nevada", 33: "New Hampshire", 34: "New Jersey", 
 35: "New Mexico", 36: "New York", 37: "North Carolina", 38: "North Dakota", 
 39: "Ohio", 40: "Oklahoma", 41: "Oregon", 42: "Pennsylvania", 
 43: "Puerto Rico", 44: "Rhode Island", 45: "South Carolina", 
 46: "South Dakota", 47: "Tennessee", 48: "Texas", 49: "Utah", 
 50: "Vermont", 52: "Virgin Islands", 51: "Virginia", 53: "Washington", 
 54: "West Virginia", 55: "Wisconsin", 56: "Wyoming"}

In [ ]:
counts_df = pd.read_csv('accident_counts.csv',encoding='utf8')
counts_df['WEEKDAY'] = pd.to_datetime(counts_df[['YEAR','MONTH','DAY']]).apply(lambda x:x.weekday())
print('{0:,} rows'.format(len(counts_df)))
counts_df.head()

population_estimates = pd.read_csv('population_estimates.csv',encoding='utf8',index_col=0)

Exploratory data analysis


In [ ]:
sb.factorplot(x='HOUR',y='ST_CASE',hue='WEEKDAY',data=counts_df,
              aspect=2,order=range(24),palette='nipy_spectral',dodge=.5)

In [ ]:
sb.factorplot(x='MONTH',y='ST_CASE',hue='WEEKDAY',data=counts_df,
              aspect=2,order=range(1,13),palette='nipy_spectral',dodge=.5)

Changes in fatal accidents following legalization?

One interesting source of exogenous variation is Colorado and Washington's legalization of cannabis in 2014. If cannabis usage increased following legalization and this translated into more impaired driving, then there should be an increase in the number of fatal auto accidents in these states after 2014.

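One quick, rough check (a sketch only, using the counts_df and population_estimates loaded above; not a substitute for the models below) is to compare average annual fatalities per 100,000 residents in Colorado and Washington before and after 2014.


In [ ]:
# Rough pre/post comparison for the two treated states (sketch, not a formal test)
_co_wa = counts_df[counts_df['STATE'].isin(['Colorado','Washington'])]
_annual = _co_wa.groupby(['STATE','YEAR'])['FATALS'].sum().reset_index()
_annual = pd.merge(_annual,population_estimates,
                   left_on=['STATE','YEAR'],right_on=['State','Year'])
_annual['Fatals_per_100k'] = _annual['FATALS']/_annual['Population']*1e5
_annual['Period'] = np.where(_annual['YEAR'] >= 2014,'2014 and after','before 2014')
_annual.groupby(['STATE','Period'])['Fatals_per_100k'].mean()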

In [ ]:
annual_state_counts_df = counts_df.groupby(['STATE','YEAR']).agg({'ST_CASE':np.sum,'DRUNK_DR':np.sum,'FATALS':np.sum}).reset_index()
annual_state_counts_df = pd.merge(annual_state_counts_df,population_estimates,
                                  left_on=['STATE','YEAR'],right_on=['State','Year']
                                 )
annual_state_counts_df = annual_state_counts_df[['STATE','YEAR','ST_CASE','DRUNK_DR','FATALS','Population']]
annual_state_counts_df.head()

Visualize accident statistics for Colorado, Washington, New Mexico (similar to Colorado), and Oregon (similar to Washington).


In [ ]:
_cols = ['ST_CASE','DRUNK_DR','FATALS']
annual_co_counts = annual_state_counts_df[(annual_state_counts_df['STATE'] == "Colorado") & (annual_state_counts_df['YEAR'] > 2010)].set_index('YEAR')[_cols]
annual_wa_counts = annual_state_counts_df[(annual_state_counts_df['STATE'] == "Washington") & (annual_state_counts_df['YEAR'] > 2010)].set_index('YEAR')[_cols]
annual_nm_counts = annual_state_counts_df[(annual_state_counts_df['STATE'] == "New Mexico") & (annual_state_counts_df['YEAR'] > 2010)].set_index('YEAR')[_cols]
annual_or_counts = annual_state_counts_df[(annual_state_counts_df['STATE'] == "Oregon") & (annual_state_counts_df['YEAR'] > 2010)].set_index('YEAR')[_cols]

# Make the figures
f,axs = plt.subplots(3,1,figsize=(10,6),sharex=True)

# Plot the cases
annual_co_counts.plot.line(y='ST_CASE',c='blue',ax=axs[0],legend=False,lw=3)
annual_wa_counts.plot.line(y='ST_CASE',c='green',ax=axs[0],legend=False,lw=3)
annual_nm_counts.plot.line(y='ST_CASE',c='red',ls='--',ax=axs[0],legend=False,lw=3)
annual_or_counts.plot.line(y='ST_CASE',c='orange',ls='--',ax=axs[0],legend=False,lw=3)
axs[0].set_ylabel('Fatal Incidents')

# Plot the drunk driving cases
annual_co_counts.plot.line(y='DRUNK_DR',c='blue',ax=axs[1],legend=False,lw=3)
annual_wa_counts.plot.line(y='DRUNK_DR',c='green',ax=axs[1],legend=False,lw=3)
annual_nm_counts.plot.line(y='DRUNK_DR',c='red',ls='--',ax=axs[1],legend=False,lw=3)
annual_or_counts.plot.line(y='DRUNK_DR',c='orange',ls='--',ax=axs[1],legend=False,lw=3)
axs[1].set_ylabel('Drunk Driving')

# Plot the fatalities
annual_co_counts.plot.line(y='FATALS',c='blue',ax=axs[2],legend=False,lw=3)
annual_wa_counts.plot.line(y='FATALS',c='green',ax=axs[2],legend=False,lw=3)
annual_nm_counts.plot.line(y='FATALS',c='red',ls='--',ax=axs[2],legend=False,lw=3)
annual_or_counts.plot.line(y='FATALS',c='orange',ls='--',ax=axs[2],legend=False,lw=3)
axs[2].set_ylabel('Total Fatalities')

# Plot 2014 legalization
for ax in axs:
    ax.axvline(x=2014,c='r')

# Stuff for legend
b = mlines.Line2D([],[],color='blue',label='Colorado',linewidth=3)
g = mlines.Line2D([],[],color='green',label='Washington',linewidth=3)
r = mlines.Line2D([],[],color='red',linestyle='--',label='New Mexico',linewidth=3)
o = mlines.Line2D([],[],color='orange',linestyle='--',label='Oregon',linewidth=3)
axs[2].legend(loc='lower center',ncol=4,handles=[b,g,r,o],fontsize=12,bbox_to_anchor=(.5,-.75))

f.tight_layout()

In [ ]:
annual_state_counts_df['Treated'] = np.where(annual_state_counts_df['STATE'].isin(['Colorado','Washington']),1,0)
annual_state_counts_df['Time'] = np.where(annual_state_counts_df['YEAR'] >= 2014,1,0)
annual_state_counts_df = annual_state_counts_df[annual_state_counts_df['YEAR'] >= 2010]

annual_state_counts_df.query('STATE == "Colorado"')

We'll specify a difference-in-differences design with a "Treated" indicator for the states that legalized (Colorado and Washington) and a "Time" indicator for the post-legalization period (2014 and later), while controlling for differences in population. The Treated:Time interaction is the difference-in-differences estimate; it is not statistically significant, which suggests legalization did not increase the risk of fatal auto accidents in these states.

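In equation form, the specification below (Outcome ~ Treated*Time + Population) is

$$Y_{st} = \beta_0 + \beta_1\,\mathrm{Treated}_s + \beta_2\,\mathrm{Time}_t + \beta_3\,(\mathrm{Treated}_s \times \mathrm{Time}_t) + \beta_4\,\mathrm{Population}_{st} + \varepsilon_{st}$$

where $Y_{st}$ is the outcome (incidents, drunk-driving cases, or fatalities) for state $s$ in year $t$, and $\beta_3$ on the interaction is the difference-in-differences estimate.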

In [ ]:
m_cases = smf.ols(formula = 'ST_CASE ~ Treated*Time + Population', 
                  data = annual_state_counts_df).fit()
print(m_cases.summary())

In [ ]:
m_cases = smf.ols(formula = 'FATALS ~ Treated*Time + Population', 
                  data = annual_state_counts_df).fit()
print(m_cases.summary())

In [ ]:
m_cases = smf.ols(formula = 'DRUNK_DR ~ Treated*Time + Population', 
                  data = annual_state_counts_df).fit()
print(m_cases.summary())

Changes in fatal accidents on 4/20?

There's another exogenous source of variation in this car crash data we can exploit: the unofficial cannabis enthusiast holiday of April 20. If consumption increases on this day compared to a week before or after (April 13 and 27), does this explain changes in fatal auto accidents?

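Because April 13, 20, and 27 are exactly one week apart, within any given year they fall on the same day of the week, so the comparison days share weekday effects; a throwaway check (the sample years are arbitrary) makes this explicit.


In [ ]:
# Sanity check: the comparison days are seven days apart, so they share a weekday within a year
for _year in [2014, 2015, 2016]:
    _days = pd.to_datetime(['{0}-04-13'.format(_year),
                            '{0}-04-20'.format(_year),
                            '{0}-04-27'.format(_year)])
    print(_year, [d.weekday() for d in _days])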

In [ ]:
counts_df.head()

In [ ]:
population_estimates.head()

In [ ]:
# Select only data after 2004 in the month of April
april_df = counts_df.query('MONTH == 4 & YEAR > 2004').set_index(['STATE','HOUR','MONTH','DAY','YEAR'])

# Re-index the data to fill in missing dates
ix = pd.MultiIndex.from_product([state_codings.values(),range(0,24),[4],range(1,31),range(2005,2017)],
                                names = ['STATE','HOUR','MONTH','DAY','YEAR'])

april_df = april_df.reindex(ix).fillna(0)
april_df.reset_index(inplace=True)

# Add in population data
april_df = pd.merge(april_df,population_estimates,
                    left_on=['STATE','YEAR'],right_on=['State','Year'])
april_df = april_df[[i for i in april_df.columns if i not in ['Year','State']]]

# Inspect
april_df.head()

In [ ]:
# Calculate whether day is a Friday, Saturday, or Sunday
april_df['Weekday'] = pd.to_datetime(april_df[['YEAR','MONTH','DAY']]).apply(lambda x:x.weekday())
april_df['Weekend'] = np.where(april_df['Weekday'] >= 4,1,0)

# Treated days are on April 20
april_df['Fourtwenty'] = np.where(april_df['DAY'] == 20,1,0)
april_df['Legal'] = np.where((april_df['STATE'].isin(['Colorado','Washington'])) & (april_df['YEAR'] >= 2014),1,0)

# Examine data for a week before and after April 20
april_df = april_df[april_df['DAY'].isin([13,20,27])]

# Inspect Colorado data
april_df.query('STATE == "Colorado"').sort_values(['YEAR','DAY'])

Inspect how the number of cases on April 20 compares to the week before and after.


In [ ]:
sb.factorplot(x='DAY',y='ST_CASE',data=april_df,kind='bar',palette=['grey','green','grey'])

Estimate the models. The Fourtwenty and Legal dummy variables (and their interaction) capture whether fatal accidents increased on April 20 compared to the week before and after, while controlling for state legality, year, whether it's a weekend, and state population. We do not observe a statistically significant increase in the number of incidents, alcohol-involved incidents, or total fatalities on April 20.

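Written out, each specification below (Outcome ~ Fourtwenty*Legal + YEAR + Weekend + Population) is

$$Y_i = \beta_0 + \beta_1\,\mathrm{Fourtwenty}_i + \beta_2\,\mathrm{Legal}_i + \beta_3\,(\mathrm{Fourtwenty}_i \times \mathrm{Legal}_i) + \beta_4\,\mathrm{YEAR}_i + \beta_5\,\mathrm{Weekend}_i + \beta_6\,\mathrm{Population}_i + \varepsilon_i$$

where $i$ indexes the state-hour-day observations and the coefficients on $\mathrm{Fourtwenty}$ and $\mathrm{Fourtwenty} \times \mathrm{Legal}$ are the quantities of interest.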

In [ ]:
m_cases_420 = smf.ols(formula = 'ST_CASE ~ Fourtwenty*Legal + YEAR + Weekend + Population', 
                  data = april_df).fit()
print(m_cases_420.summary())

In [ ]:
m_drunk_420 = smf.ols(formula = 'DRUNK_DR ~ Fourtwenty*Legal + YEAR + Weekend + Population', 
                  data = april_df).fit()
print(m_drunk_420.summary())

In [ ]:
m_fatal_420 = smf.ols(formula = 'FATALS ~ Fourtwenty*Legal + YEAR + Weekend + Population', 
                  data = april_df).fit()
print(m_fatal_420.summary())

Case study: Wikipedia pageview dynamics

On November 8, 2016, California voters passed Proposition 64, legalizing recreational use of cannabis. On January 1, 2018, recreational sales began. The following two files capture the daily pageviews for the article "Cannabis in California" as well as the daily pageviews for all the other pages it links to.


In [ ]:
ca2016 = pd.read_csv('wikipv_ca_2016.csv',encoding='utf8',parse_dates=['timestamp']).set_index('timestamp')
ca2018 = pd.read_csv('wikipv_ca_2018.csv',encoding='utf8',parse_dates=['timestamp']).set_index('timestamp')
ca2016.head()

Here are two plots of the pageview dynamics for the seed "Cannabis in California" article.


In [ ]:
f,axs = plt.subplots(2,1,figsize=(10,5))
ca2016['Cannabis in California'].plot(ax=axs[0],color='red',lw=3)
ca2018['Cannabis in California'].plot(ax=axs[1],color='blue',lw=3)
axs[0].axvline(pd.Timestamp('2016-11-08'),c='k',ls='--')
axs[1].axvline(pd.Timestamp('2018-01-01'),c='k',ls='--')

for ax in axs:
    ax.set_ylabel('Pageviews')
    
f.tight_layout()

Here is a visualization of the local hyperlink network around "Cannabis in California."


In [ ]:
g = nx.read_gexf('wikilinks_cannabis_in_california.gexf')

f,ax = plt.subplots(1,1,figsize=(10,10))

g_pos = nx.layout.kamada_kawai_layout(g)

nx.draw(G = g,
        ax = ax,
        pos = g_pos,
        with_labels = True,
        node_size = [dc*(len(g) - 1)*10 for dc in nx.degree_centrality(g).values()],
        font_size = 7.5,
        font_weight = 'bold',
        node_color = 'tomato',
        edge_color = 'grey'
       )

What kinds of causal arguments could you make from these pageview data and the hyperlink networks?

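As one simple illustration (a rough sketch, not a rigorous design; the two-week windows are arbitrary choices), we could compare mean daily pageviews of the seed article just before and just after the November 8, 2016 vote.


In [ ]:
# Mean daily pageviews for the seed article in the two weeks before vs. after Prop 64 passed
_seed = ca2016['Cannabis in California']
_pre = _seed.loc['2016-10-25':'2016-11-07'].mean()
_post = _seed.loc['2016-11-08':'2016-11-21'].mean()
print('Pre-vote mean: {0:.1f}, post-vote mean: {1:.1f}'.format(_pre,_post))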

In [ ]:

Appendix 1: Cleaning NHTSA FARS Data

"accidents.csv" is a ~450MB file after concatenating the raw annual data from NHTSA FARS.

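A minimal sketch of how that file could be assembled, assuming the per-year FARS accident tables have already been downloaded as CSVs (the fars_raw/accident_*.csv layout below is an assumption, not part of FARS):


In [ ]:
# Hypothetical assembly of accidents.csv from per-year FARS accident files
import glob
_annual_files = sorted(glob.glob('fars_raw/accident_*.csv'))  # assumed local layout
all_accident_df = pd.concat([pd.read_csv(f,low_memory=False) for f in _annual_files],
                            ignore_index=True)
all_accident_df.to_csv('accidents.csv',encoding='utf8')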

In [ ]:
all_accident_df = pd.read_csv('accidents.csv',encoding='utf8',index_col=0)
all_accident_df.head()

State population estimates for 2010-2017 from the U.S. Census Bureau.


In [ ]:
population_estimates = pd.read_csv('census_pop_estimates.csv')

_cols = [i for i in population_estimates.columns if "POPESTIMATE" in i] + ['NAME']
population_estimates_stacked = population_estimates[_cols].set_index('NAME').unstack().reset_index()

population_estimates_stacked.rename(columns={'level_0':'Year','NAME':'State',0:'Population'},inplace=True)
population_estimates_stacked['Year'] = population_estimates_stacked['Year'].str.slice(-4).astype(int)
population_estimates_stacked = population_estimates_stacked[population_estimates_stacked['State'].isin(state_codings.values())]
population_estimates_stacked.dropna(subset=['Population'],inplace=True)
population_estimates_stacked.to_csv('population_estimates.csv',encoding='utf8')

Group and aggregate the data by state, hour, day, month, and year, counting the number of cases and summing drunk-driver involvement and total fatalities. Save the result as "accident_counts.csv".


In [ ]:
gb_vars = ['STATE','HOUR','DAY','MONTH','YEAR']
agg_dict = {'ST_CASE':len,'DRUNK_DR':np.sum,'FATALS':np.sum}
counts_df = all_accident_df.groupby(gb_vars).agg(agg_dict).reset_index()

counts_df['STATE'] = counts_df['STATE'].map(state_codings)
counts_df = counts_df.query('YEAR > 1999')
counts_df.to_csv('accident_counts.csv',encoding='utf8',index=False)
counts_df.head()

Load libraries and define two functions:

  • get_page_outlinks - Get all of the outlinks from the current version of the page.
  • get_pageviews - Get all of the pageviews for an article over a time range.

In [ ]:
from datetime import datetime
import requests, json
from bs4 import BeautifulSoup
from urllib.parse import urlparse, quote, unquote
import networkx as nx

def get_page_outlinks(page_title,lang='en',redirects=1):
    """Takes a page title and returns a list of wiki-links on the page. The 
    list may contain duplicates and the position in the list is approximately 
    where the links occurred.
    
    page_title - a string with the title of the page on Wikipedia
    lang - a string (typically two letter ISO 639-1 code) for the language 
        edition, defaults to "en"
    redirects - 1 or 0 for whether to follow page redirects, defaults to 1
    
    Returns:
    outlinks_list - a list of the titles of pages linked from the current 
        version of the page
    """
    
    # Replace spaces with underscores
    page_title = page_title.replace(' ','_')
    
    bad_titles = ['Special:','Wikipedia:','Help:','Template:','Category:','International Standard','Portal:','s:','File:','Digital object identifier','(page does not exist)']
    
    # Get the response from the API for a query
    # After passing a page title, the API returns the HTML markup of the current article version within a JSON payload
    req = requests.get('https://{2}.wikipedia.org/w/api.php?action=parse&format=json&page={0}&redirects={1}&prop=text&disableeditsection=1&disabletoc=1'.format(page_title,redirects,lang))
    
    # Read the response into JSON to parse and extract the HTML
    json_string = json.loads(req.text)
    
    # Initialize an empty list to store the links
    outlinks_list = [] 
    
    if 'parse' in json_string.keys():
        page_html = json_string['parse']['text']['*']

        # Parse the HTML into Beautiful Soup
        soup = BeautifulSoup(page_html,'lxml')
        
        # Remove sections at end
        bad_sections = ['See_also','Notes','References','Bibliography','External_links']
        sections = soup.find_all('h2')
        for section in sections:
            if section.span['id'] in bad_sections:
                
                # Clean out the divs
                div_siblings = section.find_next_siblings('div')
                for sibling in div_siblings:
                    sibling.clear()
                    
                # Clean out the ULs
                ul_siblings = section.find_next_siblings('ul')
                for sibling in ul_siblings:
                    sibling.clear()
        
        # Delete tags associated with templates
        for tag in soup.find_all('tr'):
            tag.replace_with('')

        # For each paragraph tag, extract the titles within the links
        for para in soup.find_all('p'):
            for link in para.find_all('a'):
                if link.has_attr('title'):
                    title = link['title']
                    # Ignore links that aren't interesting or are redlinks
                    if all(bad not in title for bad in bad_titles) and 'redlink' not in link['href']:
                        outlinks_list.append(title)

        # For each unordered list, extract the titles within the child links
        for unordered_list in soup.find_all('ul'):
            for item in unordered_list.find_all('li'):
                for link in item.find_all('a'):
                    if link.has_attr('title'):
                        title = link['title']
                        # Ignore links that aren't interesting or are redlinks
                        if all(bad not in title for bad in bad_titles) and 'redlink' not in link['href']:
                            outlinks_list.append(title)

    return outlinks_list

def get_pageviews(page_title,lang='en',date_from='20150701',date_to=str(datetime.today().date()).replace('-','')):
    """Takes a Wikipedia page title and returns its daily pageview records.
    
    page_title - a string with the title of the page on Wikipedia
    lang - a string (typically two letter ISO 639-1 code) for the language edition,
        defaults to "en"
    date_from - a date string in a YYYYMMDD format, defaults to 20150701
    date_to - a date string in a YYYYMMDD format, defaults to today
    
    Returns:
    concat_df - a DataFrame indexed by date and multi-columned by agent and access type
    """
    quoted_page_title = quote(page_title, safe='')
    
    df_list = []
    # Only the 'all-access' access type and 'user' agent are requested below; the API
    # also exposes other access types ('desktop','mobile-app','mobile-web') and agents
    # ('all-agents','spider','bot')
    s = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/{1}.wikipedia.org/{2}/{3}/{0}/daily/{4}/{5}".format(quoted_page_title,lang,'all-access','user',date_from,date_to)
    json_response = requests.get(s).json()
    if 'items' in json_response:
        df = pd.DataFrame(json_response['items'])
        df_list.append(df)

        concat_df = pd.concat(df_list)
        concat_df['timestamp'] = pd.to_datetime(concat_df['timestamp'],format='%Y%m%d%H')
        concat_df = concat_df.set_index(['timestamp','agent','access'])['views'].unstack([1,2]).sort_index(axis=1)
        concat_df[('page','page')] = page_title
        return concat_df
    else:
        print("Error on {0}".format(page_title))
        pass

Get all of the links from "Cannabis in California", and add the seed article itself.


In [ ]:
ca_links = get_page_outlinks('Cannabis in California') + ['Cannabis in California']

link_d = {}

for l in list(set(ca_links)):
    link_d[l] = get_page_outlinks(l)

Make a network object of these hyperlinks among articles.


In [ ]:
g = nx.DiGraph()
seed_edges = [('Cannabis in California',l) for l in link_d['Cannabis in California']]
#g.add_edges_from(seed_edges)
for page,links in link_d.items():
    for link in links:
        if link in link_d['Cannabis in California']:
            g.add_edge(page,link)

print("There are {0:,} nodes and {1:,} edges in the network.".format(g.number_of_nodes(),g.number_of_edges()))
nx.write_gexf(g,'wikilinks_cannabis_in_california.gexf')

Get the pageviews for the articles linked from "Cannabis in California".


In [ ]:
pvs_2016 = {}
pvs_2018 = {}

pvs_2016['Cannabis in California'] = get_pageviews('Cannabis in California',
                                                   date_from='20160801',date_to='20170201')
pvs_2018['Cannabis in California'] = get_pageviews('Cannabis in California',
                                                   date_from='20171001',date_to='20180301')

for page in list(set(ca_links)):
    pvs_2016[page] = get_pageviews(page,date_from='20160801',date_to='20170201')
    pvs_2018[page] = get_pageviews(page,date_from='20171001',date_to='20180301')

Define a function cleanup_pageviews to make a rectangular DataFrame with dates as index, pages as columns, and pageviews as values.


In [ ]:
def cleanup_pageviews(pv_dict):
    _df = pd.concat(pv_dict.values())
    _df.reset_index(inplace=True)
    _df.columns = _df.columns.droplevel(0)
    _df.columns = ['timestamp','pageviews','page']
    _df = _df.set_index(['timestamp','page']).unstack('page')['pageviews']
    return _df

cleanup_pageviews(pvs_2016).to_csv('wikipv_ca_2016.csv',encoding='utf8')
cleanup_pageviews(pvs_2018).to_csv('wikipv_ca_2018.csv',encoding='utf8')